Data loading and preprocessing¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('phonespecs.csv')
print(len(df), "phones read")
print()

# Coerce the spec columns (cols 2..9) to numeric; unparseable entries become NaN
numeric_cols = df.columns[2:10]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
# Rows where every numeric spec parsed successfully (no NaN in any spec column)
valid_number_rows = ~df[numeric_cols].isna().any(axis=1)

#############################################################################################
# Keep only phones from brands with sufficiently many phones and also released phone recently

counts = df.groupby(['Brand']).Brand.count()              # Count number of phones for each brand
major_brand_by_counts = counts[counts >= counts['Google']] # Brands with #phones as many as Pixel

mostrecent = df.groupby(['Brand'], sort=False).Released.max() # Find the most recent release of brand
major_brand_by_recent = mostrecent[mostrecent >= 2021]    # Keep only brands that released after 2020

major_brand_rows = df.Brand.isin(major_brand_by_counts.index) \
                 & df.Brand.isin(major_brand_by_recent.index)

# Series.iteritems() was removed in pandas 2.0; .items() is the supported spelling
print("Keeping only phones from:", ', '.join(["{} ({})".format(b, c) for b, c in
        counts[major_brand_by_counts.index.intersection(major_brand_by_recent.index)].items()]))
print()

rows = (valid_number_rows & major_brand_rows)

# Aligned arrays: numeric specs, model names, brand names (same row order)
data = (df[numeric_cols])[rows].to_numpy()
labels = np.array(df['Model'].to_numpy()[rows])
brands = np.array(df['Brand'].to_numpy()[rows])

print("Total", len(data), "rows of", list(numeric_cols), "phone specs collected")
16614 phones read

Keeping only phones from: Alcatel (395), Apple (402), Archos (69), Asus (363), Coolpad (154), Google (68), Huawei (1153), Kyocera (83), LG (1243), Lenovo (257), Meizu (217), Motorola (762), Nokia (470), OnePlus (138), Oppo (849), Samsung (2389), Sharp (136), Sony (363), T-Mobile (98), TCL (134), Vivo (666), Wiko (91), Xiaomi (759), ZTE (648)

Total 11239 rows of ['Released', 'Width', 'Height', 'Depth', 'Weight', 'Display', 'Ratio', 'Battery'] phone specs collected

Remove phones that will interfere with analysis¶

Phones made before the first iPhone¶

  • The first iPhone was released in 2007
  • The first Android device (HTC Dream) was released in 2008
In [2]:
# Locate the original iPhone ("iPhone 1st") among the model names.
# np.core.defchararray was removed from NumPy's public API (NumPy 2.0);
# np.char.find is the supported elementwise substring search.
index = np.nonzero(np.char.find(labels.astype('str'), "iPhone 1st") != -1)[0]

# Keep rows released (col 0) no earlier than the iPhone 1st.
# NOTE(review): assumes exactly one match; multiple matches would broadcast
# the comparison — verify the dataset has a single "iPhone 1st" row.
rows = np.where(data[:,0] >= data[index,0])[0]

data = data[rows]
labels = labels[rows]
brands = brands[rows]

print(len(data), "rows remain after removing phones before iPhone 1st")
11130 rows remain after removing phones before iPhone 1st

Variants of the same phone¶

  • Consider two phones as variants of the same phone if they share the same specs and have similar model names
In [3]:
import editdistance

def has_similar_names(i, j):
    """Heuristic variant check for rows i and j of the global aligned arrays:
    True when both phones share a brand and the edit distance between their
    model names is below half the shorter name's length."""
    if brands[i] != brands[j]:
        return False
    shorter = min(len(labels[i]), len(labels[j]))
    return editdistance.eval(labels[i], labels[j]) < shorter // 2

# Flags marking rows identified as variants (duplicates) to drop
removed = np.full((len(data)), False)

# Sort alphabetically by model name so variants land near each other;
# the same permutation keeps data/labels/brands aligned
order = labels.argsort()
data = data[order]
labels = labels[order]
brands = brands[order]

# For each surviving row, scan up to 1000 alphabetical neighbours; a row with
# identical specs and a similar name is marked as a variant of row i.
# NOTE(review): the fixed 1000-row window is a heuristic — variants that sort
# more than 1000 rows apart would be missed.
for i in range(len(data)):
    if removed[i]:
        continue
    for j in range(i+1, min(i+1000, len(data))):
        if removed[j]:
            continue
        if (data[i] == data[j]).all() and has_similar_names(i, j):
            removed[j] = True

# Drop the flagged variants from all three aligned arrays
data = data[np.where(~removed)]
labels = labels[np.where(~removed)]
brands = brands[np.where(~removed)]

print(len(data), "rows remain after removing variants of the same phone")
5519 rows remain after removing variants of the same phone

Phones of screen ratio smaller than 1.0 (width > height)¶

In [4]:
# Drop landscape-shaped phones: keep only screen ratio (col 6) >= 1.0
cutoff_ratio = 1.0

keep = data[:, 6] >= cutoff_ratio

data = data[keep]
labels = labels[keep]
brands = brands[keep]

print(len(data), "rows remain after removing phones with larger width than height")
5407 rows remain after removing phones with larger width than height

Tablets and foldables¶

Phones of overly large screen size (>=7.5in)

In [5]:
# Rows whose display diagonal (col 5) is at least 7.5 inches.
# np.where without indices returns a tuple, so data[large_scr_rows, 5] is 2-D;
# the trailing [0] unwraps it back to a 1-D array.
large_scr_rows = np.where(data[:,5] >= 7.5)
for b, m, s, y in zip(brands[large_scr_rows], labels[large_scr_rows],
                      data[large_scr_rows, 5][0], data[large_scr_rows, 0][0]):
    print("{:7} {:70} {:5.2f} in {:5}".format(b, m, s, int(y)))
Xiaomi  Mi Mix Alpha 5G Dual SIM TD-LTE CN 512GB                                7.92 in  2020
Xiaomi  Mi Mix Fold 2021 5G Ceramic Special Edition Dual SIM TD-LTE CN 512GB    8.01 in  2021
Xiaomi  Mi Mix Fold 2021 5G Premium Edition Dual SIM TD-LTE CN 256GB            8.01 in  2021

Phones of screen size between 6.9in and 7.5in

In [6]:
# Phones with display between 6.9in and 7.5in, listed with weight so tablets
# can be separated from large phones in the next step
large_scr_rows = np.where((data[:,5] >= 6.9) & (data[:,5] < 7.5))

# brands/labels come from object-dtype pandas columns, so np.stack promotes
# the result to object dtype and the numeric entries stay floats — the sort
# on x[3] (weight) below is therefore numeric, not lexicographic
_data = np.stack((brands[large_scr_rows], labels[large_scr_rows], data[large_scr_rows, 5][0],
                  data[large_scr_rows, 4][0], data[large_scr_rows, 0][0])).transpose()
_data = sorted(_data, key=lambda x: x[3])

# Columns: brand, model, display (in), weight (g), release year
for b, m, s, w, y in _data:
    print("{:10} {:70} {:5.2f}in {:4}g {}".format(b, m, s, int(w), int(y)))
ZTE        Axon 30 5G Premium Edition Dual SIM TD-LTE CN 128GB A2322               6.92in  189g 2021
Huawei     P50 Pocket 4G Premium Art Edition Dual SIM TD-LTE CN 512GB BAL-AL00     6.90in  190g 2021
Huawei     P50 Pocket 4G Premium Edition Global Dual SIM TD-LTE 512GB BAL-L49      6.90in  190g 2022
ZTE        Axon 20 4G Global Dual SIM TD-LTE 128GB A2121E                          6.92in  198g 2020
ZTE        Axon 20 5G Global Dual SIM TD-LTE 128GB A2121G                          6.92in  198g 2021
ZTE        Axon 20 5G Standard Edition Dual SIM TD-LTE CN 128GB A2121 / A20        6.92in  198g 2020
Huawei     Enjoy 9 Max Dual SIM TD-LTE CN ARS-AL00 128GB / Changxiang Max          7.12in  210g 2018
Huawei     Honor 8X Max 4G+ Standard Edition Dual SIM TD-LTE CN 128GB ARE-TL00     7.12in  210g 2018
Huawei     Honor 8X Max Premium Edition Dual SIM TD-LTE CN 64GB ARE-AL10           7.12in  210g 2018
Huawei     Y Max Dual SIM TD-LTE APAC ARS-LX2 / Honor 8X Max ARS-L22               7.12in  210g 2018
Xiaomi     Mi Max 3 Dual SIM TD-LTE 128GB M1804E4A / M1804E4C                      6.90in  221g 2018
ZTE        Rakuten BIG 5G TD-LTE JP                                                6.92in  227g 2020
Huawei     Honor Note 10 Premium Edition Dual SIM TD-LTE CN RVL-AL09 128GB         6.95in  230g 2018
Huawei     Mate 20 X Dual SIM TD-LTE CN 128GB EVR-AL00                             7.17in  232g 2018
Huawei     Mate 20 X Global Dual SIM TD-LTE 128GB EVR-L29                          7.17in  232g 2019
Xiaomi     Black Shark 3 Pro 5G Premium Edition Dual SIM TD-LTE CN 256GB MBU-A0    7.09in  253g 2020
Lenovo     Legion Phone 2 Pro 5G Premium Edition Dual SIM TD-LTE CN 256GB L70081   6.92in  262g 2021
  • Foldable phones have screen size >= 7.5in
  • Phones with screen size between 6.9in and 7.5in that weigh more than 210g are tablets
In [7]:
# A phone counts as a tablet/foldable when its display is >= 7.5in, or when
# the display is in [6.9in, 7.5in) and the weight (col 4) exceeds 210g
is_tablet_or_foldable = (data[:,5] >= 7.5) | ((data[:,5] >= 6.9) & (data[:,5] < 7.5) & (data[:,4] > 210))

keep = ~is_tablet_or_foldable
data = data[keep]
labels = labels[keep]
brands = brands[keep]

print(len(data), "rows remain after removing tablets and foldables")
5397 rows remain after removing tablets and foldables

Longitudinal analysis¶

General phone specs¶

In [8]:
# Year range covered by the remaining phones (release year is column 0)
years = list(range(int(data[:,0].min()), int(data[:,0].max())+1))

def _year_stats(col):
    """Return ([mean per year], [std per year]) of data column `col`, grouped
    by release year over the global `years` list.

    Replaces six copy-pasted per-year comprehensions; a year with no phones
    yields NaN (same as the original code)."""
    by_year = [data[data[:,0] == y, col] for y in years]
    return [v.mean() for v in by_year], [v.std() for v in by_year]

num_phones = [np.count_nonzero(data[:,0] == y) for y in years]
avg_weight, avg_weight_error = _year_stats(4)      # Weight (g)
avg_scr_size, avg_scr_size_error = _year_stats(5)  # Display diagonal (in)
avg_scr_ratio, avg_scr_ratio_error = _year_stats(6)  # Screen ratio

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(17, 8))
plt.subplots_adjust(hspace=0.35)

ax1.bar(years, num_phones)
ax1.set_title('Number of Phones Released Each Year')

ax2.errorbar(years, avg_weight, yerr=avg_weight_error, capsize=7)
ax2.set_ylim([0, 230])
ax2.set_title('Average Weight (g)')

ax3.errorbar(years, avg_scr_size, yerr=avg_scr_size_error, capsize=7)
ax3.set_ylim([0, 8])
ax3.set_title('Average Screen Diagonal Length (inches)')

ax4.errorbar(years, avg_scr_ratio, yerr=avg_scr_ratio_error, capsize=7)
ax4.set_title('Average Screen Ratio')

plt.show()

Distributions of screen size and ratio¶

In [9]:
# Stacked histograms of display size: overall (top) and broken down by
# release year (bottom); hspace=0 joins the two panels
fig, (ax1, ax2) = plt.subplots(2, figsize=(17, 11))
plt.subplots_adjust(hspace=0)

# Display-size (col 5) samples grouped per year; the [0] unwraps the 2-D
# result of indexing with np.where's tuple
disp_size_by_year = [data[np.where(data[:,0]==y),5][0] for y in years]

ax1.hist(data[np.isin(data[:,0], years),5], bins=100)
ax2.hist(disp_size_by_year, bins=100, histtype='bar', stacked=True)
ax2.set_xlabel('Display Size (inch)')
ax2.legend(years)
ax1.set_ylabel('Distribution')
ax2.set_ylabel('Distribution')

plt.figure(figsize=(16, 5))

# Screen-ratio (col 6) samples grouped per year, drawn horizontally
scr_ratio_by_year = [data[np.where(data[:,0]==y),6][0] for y in years]

plt.hist(scr_ratio_by_year, histtype='bar', stacked=True, orientation="horizontal")
plt.ylabel('Screen Ratio')
plt.xlabel('Distribution')
plt.legend(years, loc=(1.01,0.1))

plt.show()

Graph-based analysis¶

Construct adjacency matrix¶

In [10]:
# Set threshold for establishing an edge between two models:
#  Released(y), Width(mm), Height(mm), Depth(mm), Weight(g), Display(mm), Ratio, Battery

# Set threshold for establishing an edge between two models (None = column
# not compared):
#  Released(y), Width(mm), Height(mm), Depth(mm), Weight(g), Display(mm), Ratio, Battery

thresholds = [None, 1, 1, 1, None, 0.5, 0.1, None]

# Two phones are connected iff every thresholded column differs by at most its
# threshold. Built column-wise with broadcasting instead of the original
# O(n^2 * cols) Python loop — the result (1.0/0.0 entries, zero diagonal) is
# identical, just much faster. Also: compare with None using `is`, not `==`.
adj = np.ones((len(data), len(data)))
for c, thr in enumerate(thresholds):
    if thr is None:
        continue
    col = data[:, c]
    # Zero out pairs whose |difference| exceeds the threshold in this column
    adj *= np.abs(col[:, None] - col[None, :]) <= thr
np.fill_diagonal(adj, 0)

Visually examine the adjacency matrix¶

In [11]:
# Render the adjacency matrix as an image (bright cell = edge between phones)
fig = plt.figure(figsize=(18, 18))
plt.imshow(adj)
plt.show()

Create the graph object from the adjacency matrix¶

In [12]:
import palsgraph

# Build the graph from the 0/1 adjacency matrix; nodes are keyed by model
# label (later cells index positions/communities with entries of `labels`),
# and singleton nodes are omitted
G = palsgraph.make_graph(adj, labels=list(labels), show_singletons=False)

Perform community discovery¶

In [13]:
import networkx as nx

# Girvan–Newman community detection. This returns a lazy generator of
# successively finer partitions — the (expensive) computation only happens
# when next() is called on it in a later cell.
comp = nx.algorithms.community.centrality.girvan_newman(G)

Display results¶

In [14]:
from itertools import islice
# NOTE(review): islice is imported but unused in this cell
from matplotlib import patheffects

# Map model label -> display size (col 5) formatted to 3 significant digits
label2scrsize = dict()
for i, l in enumerate(labels):
    label2scrsize[l] = "{:.3}".format(data[i,5])

# Take the first (coarsest) partition produced by Girvan–Newman
communities = next(iter(comp))

# Find optimal positions for displaying the communities in the graph
pos = palsgraph.getpos(G, communities)

# Generate a colormap
color_map = palsgraph.gen_colormap(G, communities)

# Draw graph
plt.figure(figsize=(17, 15))
nx.draw(G, pos=pos, node_color=color_map, edge_color='grey', with_labels=False)
    
# Label each community with its index in comm
for index, comm in enumerate(communities):
    # Place the index at the centroid of the community's node positions,
    # outlined in white so it stays readable over edges
    avg_pos = np.array([pos[c] for c in comm]).mean(axis=0)
    txt = plt.text(avg_pos[0], avg_pos[1], index, fontsize=14)
    txt.set_path_effects([patheffects.withStroke(linewidth=3, foreground='w')])

Colored with brands/years instead¶

In [15]:
from matplotlib.patches import Patch

### Codes for generating legends

def find_matching_row(color_map, color):
    """Return the index of the first entry of color_map that equals `color`
    elementwise, or None when no entry matches."""
    for idx, candidate in enumerate(color_map):
        if (candidate == color).all():
            return idx

def build_legends(G, labels, color_map, legend_names):
    """Build matplotlib legend handles: one Patch per distinct color in
    color_map, labelled by the legend_names entry of a node drawn in that
    color, sorted alphabetically by label."""
    distinct_colors = np.unique(np.array(color_map), axis=0)
    entries = []
    for color in distinct_colors:
        # Find any node drawn in this color and look up its legend name
        node = list(G.nodes)[find_matching_row(color_map, color)]
        name = legend_names[np.where(labels == node)[0]][0]
        entries.append((color, name))
    entries.sort(key=lambda pair: pair[1])
    return [Patch(color=c, label=n) for c, n in entries]

### Plot graph colored by brand
    
# One color group per brand: the node labels of every phone from that brand
color_groups = [labels[np.where(brands==brand)] for brand in np.unique(brands)]
color_map = palsgraph.gen_colormap(G, color_groups)

# Draw graph
plt.figure(figsize=(17, 15))
nx.draw(G, pos=pos, node_color=color_map, edge_color='grey', with_labels=False)
plt.legend(handles=build_legends(G,labels,color_map,brands), fontsize=20, ncol=6, loc='upper center')
    
# Label each community with its average display size at the community centroid
for comm in communities:
    avg_disp_size = "{:.2f}".format(np.mean([float(label2scrsize[c]) for c in comm]))
    avg_pos = np.array([pos[c] for c in comm]).mean(axis=0)
    txt = plt.text(avg_pos[0], avg_pos[1], avg_disp_size, fontsize=14)
    txt.set_path_effects([patheffects.withStroke(linewidth=3, foreground='w')])

### Plot graph colored by year

# One color group per release year
color_groups = [labels[np.where(data[:,0]==year)] for year in np.unique(data[:,0])]
color_map = palsgraph.gen_colormap(G, color_groups)

# Draw graph
plt.figure(figsize=(17, 15))
nx.draw(G, pos=pos, node_color=color_map, edge_color='grey', with_labels=False)
# (Removed the dead `years = data[:,0]` assignment here: its value was never
# used and it silently clobbered the `years` list built for the longitudinal
# analysis, breaking re-runs of those cells.)
plt.legend(handles=build_legends(G, labels, color_map, data[:,0].astype('int')), 
                                 fontsize=20, ncol=5, loc='upper center')
    
# Label each community with its display size
for comm in communities:
    avg_disp_size = "{:.2f}".format(np.mean([float(label2scrsize[c]) for c in comm]))
    avg_pos = np.array([pos[c] for c in comm]).mean(axis=0)
    txt = plt.text(avg_pos[0], avg_pos[1], avg_disp_size, fontsize=14)
    txt.set_path_effects([patheffects.withStroke(linewidth=3, foreground='w')])

Visually examine phones in selected communities¶

In [16]:
import matplotlib.image as mpimg
import random
import os

# Map model label -> image filename, loaded from a two-column CSV
label2img = np.genfromtxt('./label2img.csv', delimiter=',', dtype='str')
label2img = dict([(label, img) for label, img in label2img])

num_phones_shown = 7
min_comm_size = 20
# Pick 5 random communities with at least min_comm_size members.
# NOTE(review): random is unseeded, so the selection differs on every run;
# seed `random` for reproducibility
choice_communities = random.sample([comm for comm in communities if len(comm) >= min_comm_size], 5)

# Map model label -> row index into the aligned data/labels/brands arrays
lbll2idx = dict()
for i, l in enumerate(labels):
    lbll2idx[l] = i
    
for comm in choice_communities:
    comm_size = len(comm)
    comm = random.sample(comm, num_phones_shown) # Choose num_phones_shown phones from community
    # NOTE(review): averages below are over the sampled phones only, not the
    # whole community, even though comm_size reports the full size
    avg_disp_size = "{:.2f}".format(np.mean([float(data[lbll2idx[c],5]) for c in comm]))
    avg_width = "{:.2f}".format(np.mean([float(data[lbll2idx[c],1]) for c in comm]))
    avg_height = "{:.2f}".format(np.mean([float(data[lbll2idx[c],2]) for c in comm]))
    avg_depth = "{:.2f}".format(np.mean([float(data[lbll2idx[c],3]) for c in comm]))
    comm_desc = "Community size: {}, Averages: display {}, height {}, width {}, depth {}".format( \
        comm_size, avg_disp_size, avg_height, avg_width, avg_depth)
    # Thin header axis used only to render the description text above the row
    fig, ax = plt.subplots(1, 1, figsize=(15, 0.05))
    ax.axis('off')
    ax.text(0, 0, comm_desc, bbox=None)
    fig, axs = plt.subplots(1, num_phones_shown, figsize=(18, 3),
                            gridspec_kw={'width_ratios': [1]*num_phones_shown})
    for (i, ax), c in zip(enumerate(axs), comm):
        img = mpimg.imread(os.path.join("images", label2img[c]))
        axs[i].axis('off')
        # Overlay brand and release year on the image, outlined in white
        txt = axs[i].text(img.shape[1]/2, img.shape[0]/2, brands[lbll2idx[c]], ha='center', bbox=None)
        txt.set_path_effects([patheffects.withStroke(linewidth=5, foreground='w')])
        txt = axs[i].text(img.shape[1]/2, img.shape[0]/2+50, int(data[lbll2idx[c],0]),
                          ha='center', bbox=None)
        txt.set_path_effects([patheffects.withStroke(linewidth=5, foreground='w')])
        axs[i].imshow(img)

Examine the edges between the two large communities¶

In [17]:
def get_info(label):
    """Return printable spec strings for the phone whose model name equals
    `label`: (maker, release year, dimensions, weight, display size).

    Uses the first matching row of the global aligned arrays
    labels/brands/data."""
    # Resolve the row index once instead of recomputing np.where for every field
    row = np.where(labels == label)[0][0]
    brand = "Maker: {}".format(brands[row])
    # The original called int() on a size-1 ndarray, which is deprecated in
    # NumPy; indexing the scalar first avoids the warning
    year = "Released year: {}".format(int(data[row, 0]))
    width = data[row, 1]
    height = data[row, 2]
    depth = data[row, 3]
    dimension = "Dimension: {:.1f}mm x {:.1f}mm x {:.1f}mm".format(height, width, depth)
    weight = "Weight: {:.2f}g".format(data[row, 4])
    displaysize = "Display: {}in".format(data[row, 5])
    return (brand, year, dimension, weight, displaysize)

# Inspect the edges connecting two selected communities
comm1 = communities[1]
comm2 = communities[4]
# Cross-community edges. NOTE(review): membership is tested with ordered
# tuples; networkx undirected edge views match either orientation, so this
# should catch all cross edges — verify if G is ever directed.
edges = [(node1, node2) for node1 in comm1 for node2 in comm2 if (node1, node2) in G.edges]
# NOTE(review): d1/d2 (the spec rows) are unpacked but never used below
for n1, d1, n2, d2 in [(n1, data[np.where(labels==n1)[0]][0], n2, data[np.where(labels==n2)[0]][0])
                        for (n1, n2) in edges]:
    # Side-by-side figure: image + spec text for each endpoint of the edge
    img1 = mpimg.imread(os.path.join("images", label2img[n1]))
    img2 = mpimg.imread(os.path.join("images", label2img[n2]))
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 3), 
                                             gridspec_kw={'width_ratios': [1, 1.2, 1, 1.2]})
    ax1.axis('off')
    ax2.axis('off')
    ax3.axis('off')
    ax4.axis('off')
    ax1.imshow(img1)
    info = get_info(n1)
    ax2.text(0, 0.9, n1, bbox=None)
    ax2.text(0, 0.8, info[0], bbox=None)
    ax2.text(0, 0.7, info[1], bbox=None)
    ax2.text(0, 0.6, info[2], bbox=None)
    ax2.text(0, 0.5, info[3], bbox=None)
    ax2.text(0, 0.4, info[4], bbox=None)
    ax3.imshow(img2)
    info = get_info(n2)
    ax4.text(0, 0.9, n2, bbox=None)
    ax4.text(0, 0.8, info[0], bbox=None)
    ax4.text(0, 0.7, info[1], bbox=None)
    ax4.text(0, 0.6, info[2], bbox=None)
    ax4.text(0, 0.5, info[3], bbox=None)
    ax4.text(0, 0.4, info[4], bbox=None)

Save all variables¶

In [18]:
import dill

# dill.dump_session pickles the entire notebook namespace; the Girvan–Newman
# generator `comp` is not picklable, so it must be dropped first
del(comp) # Remove comp since this datatype can't be saved
dill.dump_session('./phone_analysis.pkl')
In [ ]: